Load the data downloaded from the GeoMX DSP Initial_Dataset.xlsx
First we will load the data as-is and perform no automatic QC trimming or normalization
source("helpers.R")
annotations_data <- read_excel("../data/HuBMAP_ProteomeAtlas_InitialDataset.xlsx", sheet = 1)
counts_data <- read_excel("../data/HuBMAP_ProteomeAtlas_InitialDataset.xlsx", sheet = 2)
write.csv(counts_data, "../data/counts.csv", row.names = FALSE)
write.csv(annotations_data, "../data/annotations.csv", row.names = FALSE)
rawdata <- processNanostringData(nsFiles = "../data/counts.csv",
sampleTab = "../data/annotations.csv",
idCol = "SegmentDisplayName",
groupCol = "Segment_name",
normalization = "none")
##
## Loading count data......
##
## Averaging technical replicates.....
The purpose of AOI-level QC is to identify AOIs with poor data that should be removed. We should look at both signal strength and background.
First, we will compute 2 metrics of AOI technical performance: • Housekeeper geomean: this captures signal strength. • IgG geomean: this captures background (negative controls), but in most experiments also reflects signal strength, as AOIs with more on-target signal also have more background.
We will remove one AOI with an outlying (high) IgG geomean: P4/P5 Ins/Syto83/PanCK/CD31 7-9-24 | 036 | CD31+
data <-rawdata
# Extract the necessary data
exprs_data <- exprs(data)
sample_data <- pData(data)
# Define the housekeeper genes names
hk_genes <- c("GAPDH", "RPS6", "Histone H3")
# Calculate the geometric mean for housekeeper genes
geometric_mean <- function(x) {
# Remove NA values and non-positive values that would cause issues with log
x <- x[x > 0]
if (length(x) == 0) {
return(NA)
}
exp(mean(log(x)))
}
hk_geomeans <- apply(exprs_data[hk_genes, ], 2, geometric_mean)
# Create a data frame for plotting
plot_data <- data.frame(
Sample = colnames(exprs_data),
HK_Geomean = log2(hk_geomeans),
Scan_Label = sample_data$ScanLabel
)
# Plot the barplot with no x-axis labels and "Individual AOIs" as the x-axis title
ggplot(plot_data, aes(x = Sample, y = HK_Geomean, fill = Scan_Label)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(x = "Individual AOIs", y = "Housekeeper Geomean", fill = "Scan Label") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
data <- rawdata
# Extract expression data
exprs_data <- exprs(data)
# Extract sample data
sample_data <- pData(data)
# Find the IgG genes by looking for "IgG" in the row names
igG_genes <- rownames(exprs_data)[grepl("IgG", rownames(exprs_data))]
# Function to calculate geometric mean
geometric_mean <- function(x) {
# Remove NA values and non-positive values
x <- x[x > 0]
if (length(x) == 0) {
return(NA)
}
exp(mean(log(x)))
}
# Calculate the geometric mean for IgG genes for each sample
igG_geomeans <- apply(exprs_data[igG_genes, ], 2, geometric_mean)
# Create a data frame for plotting
plot_data <- data.frame(
Sample = colnames(exprs_data),
IgG_Geomean = igG_geomeans,
Scan_Label = sample_data$ScanLabel
)
# Create the bar plot for IgG genes
ggplot(plot_data, aes(x = Sample, y = IgG_Geomean, fill = Scan_Label)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(x = "Individual AOIs", y = "IgG Geomean", fill = "Scan Label") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
sample_names <- colnames(exprs(rawdata))
# Exclude the sample "P4P5_InsSyto83PanCKCD31_7924_036_CD31"
rawdata <- rawdata[, !sample_names %in% "P4P5_InsSyto83PanCKCD31_7924_036_CD31"]
Technical QC assessment includes Minimum nuclei count and Minimum surface area
QC_data <- processNanostringData(nsFiles = "../data/counts.csv",
sampleTab = "../data/annotations.csv",
idCol = "SegmentDisplayName",
groupCol = "Segment_name",
normalization = "RUVIII",
skip.housekeeping = TRUE,
housekeeping = c("GAPDH", "RPS6", "Histone H3"),
output.format = "list")
##
## Loading count data......
## Warning in read_sampleData(dat, file.name = sampleTab, idCol = idCol, groupCol = groupCol, :
## Sample names in the two files don't match. NanoTube is
## assuming that samples are in the same order. Please confirm
## with your data.
# Set the thresholds from the GeoMX DSP Analysis Suite
nuclei_count_min <- 20
surface_area_min <- 1600
# Create QC flags
QC_data$samples$NucleiCount_QC_Flag <- QC_data$samples$AOINucleiCount < nuclei_count_min
QC_data$samples$SurfaceArea_QC_Flag <- QC_data$samples$AOISurfaceArea < surface_area_min
# Assuming QC_data is your data frame with sample data and QC flags
# We'll filter the data for any rows with at least one TRUE flag
QC_flagged_data <- QC_data$samples %>%
filter(NucleiCount_QC_Flag | SurfaceArea_QC_Flag) %>%
select(SegmentDisplayName, NucleiCount_QC_Flag, SurfaceArea_QC_Flag)
# Print the table using kable in R Markdown
kable(QC_flagged_data,row.names = FALSE)
| SegmentDisplayName | NucleiCount_QC_Flag | SurfaceArea_QC_Flag |
|---|
#save to a list
QC_flagged_segments <- QC_flagged_data$SegmentDisplayName
Probes that never rise above background should be interpreted carefully. The plot below shows a convenient way to identify poorly-performing probes.
Here we compute and plot the “signal-tobackground” ratio per target, which is each AOI’s data divided by its IgG geomean.
data <- rawdata
# Extract expression data
exprs_data <- exprs(data)
# Calculate the geometric mean for IgG controls
igG_controls <- rownames(exprs_data)[grepl("IgG", rownames(exprs_data))]
geometric_mean <- function(x) {
x <- x[x > 0]
if (length(x) == 0) {
return(NA)
}
exp(mean(log(x)))
}
igG_geomeans <- apply(exprs_data[igG_controls, ], 2, geometric_mean)
# Compute signal-to-background ratio for each target
signal_to_background <- sweep(exprs_data, 2, igG_geomeans, FUN="/")
log2_signal_to_background <- log2(signal_to_background)
# Convert to long format for plotting
long_data <- melt(log2_signal_to_background)
# Order the features by whether they are IgG controls and their median signal-to-background ratio
features_ordered <- long_data %>%
group_by(Var1) %>%
summarize(is_igG = any(Var1 %in% igG_controls), median_value = median(value, na.rm = TRUE), .groups = 'drop') %>%
arrange(is_igG, median_value) %>%
pull(Var1)
# Update the variable factor levels to match the order
long_data$variable <- factor(long_data$Var1, levels = features_ordered)
# Create the boxplot
boxplot <- ggplot(long_data, aes(x = variable, y = value)) +
geom_boxplot(outlier.shape = NA, color = "black") + # Black boxes without outliers
geom_jitter(color = "red", width = 0.2, size = 0.5) + # All points in red with smaller size
geom_hline(yintercept = 0, linetype = "solid") + # Add horizontal line at y=0
geom_vline(xintercept = 3 + 0.5, linetype = "dashed") + # Correct vertical line position
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x = "", y = "Log2 Signal-to-Background Ratio")
# Print the plot
print(boxplot)
No segments failed segment QC, so we do not need to remove anything
#Define segments to keep
QC_passed_segments <- rawdata$SegmentDisplayName[!rawdata$SegmentDisplayName %in% QC_flagged_segments]
data_filtered <- rawdata[,QC_passed_segments]
For now, we will leave all the probes in the dataset even though some have below background signal. We will make a list of these so that we can proceed with caution when evaluating results for these genes
library(dplyr)
# Define a cutoff for poorly performing probes (log2 ratio < 0)
cutoff <- 0 # You can adjust this based on your criteria
# Calculate median log2 signal-to-background ratio for each probe
probe_performance <- long_data %>%
group_by(Var1) %>%
summarize(median_value = median(value, na.rm = TRUE), .groups = 'drop')
# Identify poorly-performing probes where the median log2 ratio is below the cutoff
poor_probes <- probe_performance %>%
filter(median_value < cutoff)
# Print the list of poorly-performing probes
print(as.character(poor_probes$Var1)[-grep("Ig",as.character(poor_probes$Var1))])
## [1] "STAT6 (phospho Y641)"
## [2] "UBE2C"
## [3] "Bcl-2"
## [4] "Smad2 (phospho S255)"
## [5] "ICAM1"
## [6] "Ikaros"
## [7] "CD18"
## [8] "PD-L1"
## [9] "MICA + MICB"
## [10] "CDKN2A/p14ARF"
## [11] "Oct-2"
## [12] "FANCI"
## [13] "CD39"
## [14] "T-bet / Tbx21"
## [15] "S100 beta"
## [16] "FOXP3"
## [17] "C4a"
## [18] "beta 2 Microglobulin"
## [19] "AKT1 (phospho T450)"
## [20] "Bim"
## [21] "TGF beta 1"
## [22] "CXCL10/IP-10"
## [23] "Syk (phospho Y352) +ZAP70 (phospho Y319)"
## [24] "CD3G"
## [25] "Oct4"
## [26] "RUNX1 / AML1 + RUNX3 + RUNX2"
## [27] "TLR8"
## [28] "Aldolase B+Aldolase C"
## [29] "CD15"
## [30] "CTLA4"
## [31] "CCR7"
## [32] "CD134 / OX40L receptor"
## [33] "CDKN2A/p16INK4a"
## [34] "Cyclin B1"
## [35] "Musashi 1 / Msi1"
## [36] "CD38"
## [37] "Mucin 5AC"
## [38] "CD66b"
## [39] "ENO1 + ENO2 + ENO3"
## [40] "Carbonic Anhydrase 3/CA3"
## [41] "CD276"
## [42] "Wilms Tumor Protein"
## [43] "Fas"
## [44] "FGF2"
## [45] "CD27"
## [46] "iNOS"
## [47] "CDK1 + CDK2 + CDK3 + CDK5 (phospho Y15)"
## [48] "KMT6 / EZH2"
## [49] "CD44v6"
## [50] "CD163"
# Save the poorly-performing probes to a CSV file
write.csv(poor_probes, "../results/poorly_performing_probes.csv", row.names = FALSE)
Options for normalization include housekeeping, negative control normalization, background correction, and scale to area or nuclei count.
It is not recommended to use multiple of these options, but instead to explore the data and choose one approach to normalize.
Do the IgG’s have high enough counts to be used for normalization?
These IgGs measure background, which can be a normalization method to compare slides and/or cell populations
library(Biobase)
data_filtered <- rawdata
# Extract the necessary data
exprs_data <- exprs(data_filtered)
feature_data <- fData(data_filtered)
sample_data <- pData(data_filtered)
# Find the IgG control feature names
ig_controls <- rownames(feature_data)[grepl("IgG", rownames(feature_data))]
# Filter out only IgG controls data
ig_controls_data <- exprs_data[ig_controls, ]
# Create pairs of IgG controls for plotting
ig_pairs <- combn(ig_controls, 2, simplify = FALSE)
# Create a list to store ggplot objects
plots <- list()
# Loop over each pair and create a scatter plot
for (i in seq_along(ig_pairs)) {
pair <- ig_pairs[[i]]
# Prepare the data for plotting
plot_data <- data.frame(
#x = as.numeric(ig_controls_data[pair[1], ]),
#y = as.numeric(ig_controls_data[pair[2], ]),
x = ig_controls_data[pair[1], ],
y = ig_controls_data[pair[2], ],
# Sample = colnames(ig_controls_data),
Scan_Name=sample_data$ScanLabel
)
# Create the plot
p <- ggplot(plot_data, aes(x = x, y = y)) +
geom_point(aes(color = Scan_Name)) +
geom_smooth(method = "lm", se = FALSE) +
labs(x = pair[1], y = pair[2], title = paste("Scatter plot of", pair[1], "vs", pair[2])) +
theme_minimal()
# Add the plot to the list
plots[[i]] <- p
}
# Print the plots
print(plots)
## [[1]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[3]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[5]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[6]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[7]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[8]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[9]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[10]]
## `geom_smooth()` using formula = 'y ~ x'
All the IgG counts are high enough to use negative control normalization if desired.
Assumes the housekeepers measure primarily signal strength
Housekeepers should be highly correlated (with consistent ratios between them)
# Extract the necessary data
exprs_data <- exprs(data_filtered)
feature_data <- fData(data_filtered)
sample_data <- pData(data_filtered)
# List of housekeeper genes
housekeepers <- c("GAPDH", "RPS6", "Histone H3")
# Ensure that the housekeeper genes are present in the dataset
if (!all(housekeepers %in% rownames(feature_data))) {
stop("Not all housekeeper genes are present in the dataset.")
}
# Filter out only housekeeper genes data
hk_data <- exprs_data[housekeepers, ]
# Create pairs of housekeeper genes for plotting
hk_pairs <- combn(housekeepers, 2, simplify = FALSE)
# Create a list to store ggplot objects and correlation coefficients
plots <- list()
correlations <- data.frame(Pair = character(), R_squared = numeric(), stringsAsFactors = FALSE)
# Loop over each pair and create a scatter plot with a linear fit
for (i in seq_along(hk_pairs)) {
pair <- hk_pairs[[i]]
# Prepare the data for plotting
plot_data <- data.frame(
x = as.numeric(hk_data[pair[1], ]),
y = as.numeric(hk_data[pair[2], ]),
Sample = colnames(hk_data),
Scan_Name=sample_data$ScanLabel
)
# Compute the correlation coefficient
cor_coefficient <- cor(plot_data$x, plot_data$y)
# Create the plot
p <- ggplot(plot_data, aes(x = x, y = y)) +
geom_point(aes(color = Scan_Name)) +
geom_smooth(method = "lm", se = FALSE) +
labs(x = pair[1], y = pair[2], title = paste("Scatter plot of", pair[1], "vs", pair[2]),
subtitle = paste("R^2:", round(cor_coefficient^2, digits = 3))) +
theme_minimal()
# Add the plot and correlation to the lists
plots[[i]] <- p
correlations <- rbind(correlations, data.frame(Pair = paste(pair, collapse = " vs "), R_squared = cor_coefficient^2))
}
# Print the plots and correlations
print(plots)
## [[1]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[3]]
## `geom_smooth()` using formula = 'y ~ x'
print(correlations)
## Pair R_squared
## 1 GAPDH vs RPS6 0.4751740
## 2 GAPDH vs Histone H3 0.1147531
## 3 RPS6 vs Histone H3 0.3675943
Housekeepers do not exhibit consistent and high correlations in this dataset
Check how the different variables perform with respect to each other before normalization. This also let’s use assess whether scaling to area and nuclei counts is a viable approach.
#define function to calculate geometric mean
geometric_mean <- function(x) {
# Remove NA values and non-positive values that would cause issues with log
x <- x[x > 0]
if (length(x) == 0) {
return(NA)
}
exp(mean(log(x)))
}
# Extract the necessary data
exprs_data <- exprs(data_filtered)
feature_data <- fData(data_filtered)
sample_data <- pData(data_filtered)
# Assuming geomeans for IgGs and housekeepers are calculated or can be accessed
# If not calculated, you would need to add code to calculate these from the exprs_data
# Access the area variable
area <- sample_data$AOISurfaceArea
nuclei <- sample_data$AOINucleiCount
# Geomeans for IgGs
ig_controls <- rownames(feature_data)[grepl("IgG", rownames(feature_data))]
ig_geomean <- apply(exprs_data[ig_controls, ], 2, geometric_mean) # User needs to define geometric_mean or replace it with correct function
# Geomeans for housekeepers
housekeepers <- c("GAPDH", "RPS6", "Histone H3")
hk_geomean <- apply(exprs_data[housekeepers, ], 2, geometric_mean) # As above
# Create a combined data frame for plotting
consistency_data <- data.frame(
Sample = colnames(exprs_data),
IgG_Geomean = ig_geomean,
HK_Geomean = hk_geomean,
Area = area,
Nuclei= nuclei,
Scan_Name=sample_data$ScanLabel
)
# Plotting the relationships
p1 <- ggplot(consistency_data, aes(x = IgG_Geomean, y = HK_Geomean)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
theme_minimal() +
geom_point(aes(color = Scan_Name))+
ggtitle("IgG vs Housekeeper Geomeans")
p2 <- ggplot(consistency_data, aes(x = HK_Geomean, y = Area)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
theme_minimal() +
geom_point(aes(color = Scan_Name))+
ggtitle("Housekeeper Geomeans vs Area")
p3 <- ggplot(consistency_data, aes(x = IgG_Geomean, y = Area)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
theme_minimal() +
geom_point(aes(color = Scan_Name))+
ggtitle("IgG Geomeans vs Area")
p4 <- ggplot(consistency_data, aes(x = IgG_Geomean, y = Nuclei)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
theme_minimal() +
geom_point(aes(color = Scan_Name))+
ggtitle("IgG Geomeans vs Nuclei count")
p5 <- ggplot(consistency_data, aes(x = HK_Geomean, y = Nuclei)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
theme_minimal() +
geom_point(aes(color = Scan_Name))+
ggtitle("Housekeeper Geomeans vs Nuclei count")
p6 <- ggplot(consistency_data, aes(x = Area, y = Nuclei)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
theme_minimal() +
geom_point(aes(color = Scan_Name))+
ggtitle("Area vs Nuclei count")
# Print the plots
print(p1)
print(p2)
print(p3)
print(p4)
print(p5)
print(p6)
Based on these results, we can consider background normlalization or scaling to nuceli count or area.